• 1 Introduction and learning objectives
  • 2 Load data
  • 3 Visualize data
    • 3.1 Price Distribution
    • 3.2 Grouping the Houses by Whether they are old or new
    • 3.3 Grouping by Property Type
    • 3.4 Grouping by whether Freehold or Leasehold
  • 4 Correlation Plots
  • 5 Regression Analysis
    • 5.1 New Variables
    • 5.2 First Linear Regression Model
    • 5.3 Second Linear Regression Model
    • 5.4 Third Linear Regression Model
    • 5.5 Fourth Linear Regression Model
    • 5.6 Fit a tree model
    • 5.7 Gradient Boosting
    • 5.8 Least Absolute Shrinkage and Selection Operator
    • 5.9 Random Forest
    • 5.10 Comparing performance of all the created models
    • 5.11 Stacking
  • 6 Visualising results
  • 7 Pick investments

1 Introduction and learning objectives

2 Load data

The most important part of any analysis is making sure that the dataset used to form hypotheses and reach conclusions is suitable for the problem statement. I begin the exploratory data analysis with several data-cleaning steps, such as converting character variables into factors and making sure the date variable is stored as a `Date`.

# Read in the data ----
london_house_prices_2019_training <- read.csv("training_data_assignment_with_prices.csv")
london_house_prices_2019_out_of_sample <- read.csv("test_data_assignment.csv")

# Fix data types in both data sets ----

# Store the date column as a Date
london_house_prices_2019_training <- london_house_prices_2019_training %>%
  mutate(date = as.Date(date))
london_house_prices_2019_out_of_sample <- london_house_prices_2019_out_of_sample %>%
  mutate(date = as.Date(date))

# Convert every character column to a factor
london_house_prices_2019_training <- london_house_prices_2019_training %>%
  mutate(across(where(is.character), as.factor))
london_house_prices_2019_out_of_sample <- london_house_prices_2019_out_of_sample %>%
  mutate(across(where(is.character), as.factor))

# Give these factors the same set of levels (the union of both data sets) in
# the training and the out-of-sample data. One loop replaces four copy-pasted
# stanzas that stored the level sets in globals named a, b, c and d.
for (level_col in c("postcode_short", "water_company", "nearest_station", "district")) {
  shared_levels <- union(
    levels(london_house_prices_2019_training[[level_col]]),
    levels(london_house_prices_2019_out_of_sample[[level_col]])
  )
  london_house_prices_2019_training[[level_col]] <- factor(
    london_house_prices_2019_training[[level_col]],
    levels = shared_levels
  )
  london_house_prices_2019_out_of_sample[[level_col]] <- factor(
    london_house_prices_2019_out_of_sample[[level_col]],
    levels = shared_levels
  )
}

# Take a quick look at what's in the data
str(london_house_prices_2019_training)
## 'data.frame':    13998 obs. of  37 variables:
##  $ ID                          : int  2 3 4 5 7 8 9 10 11 12 ...
##  $ date                        : Date, format: "2019-11-01" "2019-08-08" ...
##  $ postcode                    : Factor w/ 12635 levels "BR1 1AB","BR1 1LR",..: 10897 11027 11264 2031 11241 11066 421 9594 9444 873 ...
##  $ property_type               : Factor w/ 4 levels "D","F","S","T": 2 2 3 2 3 2 1 4 4 2 ...
##  $ whether_old_or_new          : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ freehold_or_leasehold       : Factor w/ 2 levels "F","L": 2 2 1 2 1 2 1 1 1 2 ...
##  $ address1                    : Factor w/ 2825 levels "1","1 - 2","1 - 3",..: 2503 792 253 789 569 234 264 418 5 274 ...
##  $ address2                    : Factor w/ 434 levels "1","10","101",..: 372 NA NA NA NA NA NA NA NA NA ...
##  $ address3                    : Factor w/ 8543 levels "ABBERTON WALK",..: 6990 6821 3715 2492 4168 2879 3620 5251 6045 6892 ...
##  $ town                        : Factor w/ 133 levels "ABBEY WOOD","ACTON",..: NA NA NA 78 NA NA NA NA NA NA ...
##  $ local_aut                   : Factor w/ 69 levels "ASHFORD","BARKING",..: 36 46 24 36 24 46 65 36 36 17 ...
##  $ county                      : Factor w/ 33 levels "BARKING AND DAGENHAM",..: 22 27 18 25 18 27 5 27 32 8 ...
##  $ postcode_short              : Factor w/ 248 levels "BR1","BR2","BR3",..: 190 194 198 28 198 194 4 169 167 8 ...
##  $ current_energy_rating       : Factor w/ 6 levels "B","C","D","E",..: 4 3 3 4 3 2 4 3 4 2 ...
##  $ total_floor_area            : num  30 50 100 39 88 101 136 148 186 65 ...
##  $ number_habitable_rooms      : int  2 2 5 2 4 4 6 6 6 3 ...
##  $ co2_emissions_current       : num  2.3 3 3.7 2.8 3.9 3.1 8.1 5.6 10 1.5 ...
##  $ co2_emissions_potential     : num  1.7 1.7 1.5 1.1 1.4 1.4 4.1 2 6.1 1.5 ...
##  $ energy_consumption_current  : int  463 313 212 374 251 175 339 216 308 128 ...
##  $ energy_consumption_potential: int  344 175 82 144 90 77 168 75 186 128 ...
##  $ windows_energy_eff          : Factor w/ 5 levels "Average","Good",..: 1 1 1 5 1 1 1 1 5 1 ...
##  $ tenure                      : Factor w/ 3 levels "owner-occupied",..: 1 2 1 2 1 1 1 2 1 1 ...
##  $ latitude                    : num  51.5 51.5 51.5 51.6 51.5 ...
##  $ longitude                   : num  -0.1229 -0.2828 -0.4315 0.0423 -0.4293 ...
##  $ population                  : int  34 75 83 211 73 51 25 91 60 97 ...
##  $ altitude                    : int  8 9 25 11 21 11 95 7 7 106 ...
##  $ london_zone                 : int  1 3 5 3 6 6 3 2 2 3 ...
##  $ nearest_station             : Factor w/ 594 levels "abbey road","abbey wood",..: 478 358 235 319 180 502 566 30 32 566 ...
##  $ water_company               : Factor w/ 5 levels "Affinity Water",..: 5 5 1 5 1 5 5 5 5 5 ...
##  $ average_income              : int  57200 61900 50600 45400 49000 56200 57200 65600 50400 52300 ...
##  $ district                    : Factor w/ 33 levels "Barking and Dagenham",..: 22 27 18 26 18 27 5 27 32 8 ...
##  $ price                       : num  360000 408500 499950 259999 395000 ...
##  $ type_of_closest_station     : Factor w/ 3 levels "light_rail","rail",..: 3 2 3 1 3 2 1 3 1 1 ...
##  $ num_tube_lines              : int  1 0 1 0 1 0 0 2 0 0 ...
##  $ num_rail_lines              : int  0 1 1 0 1 1 0 0 1 0 ...
##  $ num_light_rail_lines        : int  0 0 0 1 0 0 1 0 1 1 ...
##  $ distance_to_station         : num  0.528 0.77 0.853 0.29 1.073 ...
# Same structural check for the out-of-sample data set
str(london_house_prices_2019_out_of_sample)
## 'data.frame':    1999 obs. of  37 variables:
##  $ ID                          : int  14434 12562 8866 10721 1057 1527 13961 12108 9363 1155 ...
##  $ date                        : Date, format: NA NA ...
##  $ postcode                    : logi  NA NA NA NA NA NA ...
##  $ property_type               : Factor w/ 4 levels "D","F","S","T": 1 2 2 3 4 3 2 3 2 4 ...
##  $ whether_old_or_new          : Factor w/ 2 levels "N","Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ freehold_or_leasehold       : Factor w/ 2 levels "F","L": 1 2 2 1 1 1 2 1 2 1 ...
##  $ address1                    : logi  NA NA NA NA NA NA ...
##  $ address2                    : logi  NA NA NA NA NA NA ...
##  $ address3                    : logi  NA NA NA NA NA NA ...
##  $ town                        : Factor w/ 54 levels "ACTON","ADDISCOMBE",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ local_aut                   : logi  NA NA NA NA NA NA ...
##  $ county                      : logi  NA NA NA NA NA NA ...
##  $ postcode_short              : Factor w/ 248 levels "BR1","BR2","BR3",..: 91 58 37 60 232 159 168 124 187 135 ...
##  $ current_energy_rating       : Factor w/ 6 levels "B","C","D","E",..: 3 2 3 3 4 4 4 3 4 3 ...
##  $ total_floor_area            : num  150 59 58 74 97.3 ...
##  $ number_habitable_rooms      : int  6 2 2 5 5 5 5 4 2 5 ...
##  $ co2_emissions_current       : num  7.3 1.5 2.8 3.5 6.5 4.9 5.1 2.9 4.2 4.3 ...
##  $ co2_emissions_potential     : num  2.4 1.4 1.2 1.2 5.7 1.6 3 0.8 3.2 2.5 ...
##  $ energy_consumption_current  : int  274 142 253 256 303 309 240 224 458 253 ...
##  $ energy_consumption_potential: int  89 136 110 80 266 101 140 58 357 143 ...
##  $ windows_energy_eff          : Factor w/ 5 levels "Average","Good",..: 1 1 1 1 1 1 3 1 3 1 ...
##  $ tenure                      : Factor w/ 3 levels "owner-occupied",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ latitude                    : num  51.6 51.6 51.5 51.6 51.5 ...
##  $ longitude                   : num  -0.129 -0.2966 -0.0328 -0.3744 -0.2576 ...
##  $ population                  : int  87 79 23 73 100 24 22 49 65 98 ...
##  $ altitude                    : int  63 38 17 39 8 46 26 16 14 18 ...
##  $ london_zone                 : int  4 4 2 5 2 4 3 6 1 3 ...
##  $ nearest_station             : Factor w/ 594 levels "abbey road","abbey wood",..: 19 546 214 362 516 168 24 519 148 251 ...
##  $ water_company               : Factor w/ 5 levels "Affinity Water",..: 5 1 5 1 5 5 5 2 5 5 ...
##  $ average_income              : int  61300 48900 46200 52200 60700 59600 64000 48100 56600 53500 ...
##  $ district                    : Factor w/ 33 levels "Barking and Dagenham",..: 10 4 30 15 18 11 32 16 20 23 ...
##  $ type_of_closest_station     : Factor w/ 3 levels "light_rail","rail",..: 3 3 1 2 3 2 3 3 3 2 ...
##  $ num_tube_lines              : int  1 2 0 0 2 0 1 1 2 0 ...
##  $ num_rail_lines              : int  0 1 0 1 0 1 1 0 0 1 ...
##  $ num_light_rail_lines        : int  0 1 1 0 0 0 0 1 0 0 ...
##  $ distance_to_station         : num  0.839 0.104 0.914 0.766 0.449 ...
##  $ asking_price                : num  750000 229000 152000 379000 930000 350000 688000 386000 534000 459000 ...

Having confirmed that the dataset is suitable, I set a seed so that the results can be reproduced, and then split the dataset into a training set and a testing set. I use the training set to develop a model that can predict accurately on unseen sample points, i.e., the testing set.

# Initial train/test partition ----
library(rsample)

# Fixed seed so the random partition can be reproduced
set.seed(69)

# 75% of the labelled rows go to training, the remainder to testing
train_test_split <- london_house_prices_2019_training %>%
  initial_split(prop = 0.75)

# Materialise the two partitions
train_data <- training(train_test_split)
test_data <- testing(train_test_split)

3 Visualize data

Before building any models, it is critical to analyse the distribution of the price of each house and its relationship with the several factors that possess the potential to impact its fluctuations.

3.1 Price Distribution

# Histogram of sale prices on a density scale with a density curve on top
ggplot(train_data, aes(x = price)) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can see that this distribution roughly follows the 80:20 rule: around 80% of the purchases fall within the cheapest 20% of the price range, and very few buyers purchase properties priced above roughly £600K.

Above is a histogram combined with a density estimate of how many properties were sold at each price point. Unsurprisingly, the distribution looks like it follows a power law. The easiest way to understand the power law distribution is to think of the famous 80:20 rule. In business this often translates to “80% of your revenue comes from 20% of your customers”. In the context of this dataset, you can make a very rough estimate that 80% of purchases were made in the lowest 20% of the price range.

# Same plot for the natural log of the sale price
ggplot(train_data, aes(x = log(price))) +
  geom_histogram(aes(y = ..density..)) +
  geom_density()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Convert a log-price of 13 back to pounds
exp(13)
## [1] 442413.4

We can see that the log of the prices is approximately normally distributed, with a peak near 13, which corresponds to roughly £440,000.

3.2 Grouping the Houses by Whether they are old or new

# Count and price summary statistics for each value of whether_old_or_new
by_whether_old_or_new <- train_data %>%
  select(price, whether_old_or_new) %>%
  group_by(whether_old_or_new) %>%
  summarise(
    Count = n(),
    Mean.Price = mean(price),
    St.Dev.Price = sd(price),
    Median.Price = median(price)
  )
## `summarise()` ungrouping output (override with `.groups` argument)

# The four bar charts differ only in the statistic on the y axis, so they
# share one base plot and one bar layer.
old_new_base <- ggplot(
  by_whether_old_or_new,
  aes(x = whether_old_or_new, fill = whether_old_or_new)
)
old_new_bars <- geom_bar(position = "dodge", stat = "identity", width = 0.9)

a <- old_new_base + aes(y = Count) + old_new_bars
b <- old_new_base + aes(y = Mean.Price) + old_new_bars
c <- old_new_base + aes(y = St.Dev.Price) + old_new_bars
d <- old_new_base + aes(y = Median.Price) + old_new_bars

# Arrange the four charts in a 2 x 2 grid with one shared legend
library(ggpubr)
ggarrange(
  a, b, c, d,
  ncol = 2,
  nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)

# Violin plot of sale price for each value of whether_old_or_new.
# scale_y_log10() puts the prices on a log10 scale. The earlier
# coord_trans(y = "log10") call was removed: coord_flip() replaced it, so it
# had no effect on the plot and only produced the "Coordinate system already
# present" message.
ggplot(train_data, aes(x = whether_old_or_new, y = price)) +
  stat_ydensity(trim = FALSE, aes(fill = whether_old_or_new)) +
  scale_y_log10(breaks = round(10^seq(3.6, 8, 0.2))) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

It is quite unsurprising that most of the houses in London were not newly built. The median price suggests a small difference in price between newly built homes and older homes. However, we cannot actually base many conclusions on this because there are only 7 new houses in this dataset. Both the distributions seem quite similar and we can observe that 400K seems to be the most popular pricing point.

3.3 Grouping by Property Type

# Count and price summary statistics for each property type
by_pt <- train_data %>%
  select(price, property_type) %>%
  group_by(property_type) %>%
  summarise(
    Count = n(),
    Mean.Price = mean(price),
    St.Dev.Price = sd(price),
    Median.Price = median(price)
  )
## `summarise()` ungrouping output (override with `.groups` argument)

# The four bar charts differ only in the statistic on the y axis, so they
# share one base plot and one bar layer.
property_type_base <- ggplot(by_pt, aes(x = property_type, fill = property_type))
property_type_bars <- geom_bar(position = "dodge", stat = "identity", width = 0.9)

a <- property_type_base + aes(y = Count) + property_type_bars
b <- property_type_base + aes(y = Mean.Price) + property_type_bars
c <- property_type_base + aes(y = St.Dev.Price) + property_type_bars
d <- property_type_base + aes(y = Median.Price) + property_type_bars

# Arrange the four charts in a 2 x 2 grid with one shared legend
library(ggpubr)
ggarrange(
  a, b, c, d,
  ncol = 2,
  nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)

The count variable shows the number of purchases for each property type. The most frequently purchased property type in London was Flats, followed by Terraced Houses, whereas there are considerably fewer Semi-Detached and Detached Houses (which probably stems from the fact that these cost more, with Detached properties being considerably pricier). Apart from detached houses, all the other property types have roughly similar prices. From the standard deviations, we can see that Detached and Terraced houses vary considerably in price, followed by Flats, whereas Semi-Detached house prices are comparatively less spread out.

# Violin plot of sale price for each property type.
# scale_y_log10() puts the prices on a log10 scale. The earlier
# coord_trans(y = "log10") call was removed: coord_flip() replaced it, so it
# had no effect on the plot and only produced the "Coordinate system already
# present" message.
ggplot(train_data, aes(x = property_type, y = price)) +
  stat_ydensity(trim = FALSE, aes(fill = property_type)) +
  scale_y_log10(breaks = round(10^seq(3.6, 8, 0.2))) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The violin plots allow us to compare the distribution of house prices for each property type. From the plot, we can see that the distributions for Terraced and Semi-Detached Houses are quite similar, whereas Flats are priced lower than those two and Detached houses are priced significantly higher than the others.

3.4 Grouping by whether Freehold or Leasehold

# Count and price summary statistics for freehold vs leasehold properties
by_freehold_or_leasehold <- train_data %>%
  select(price, freehold_or_leasehold) %>%
  group_by(freehold_or_leasehold) %>%
  summarise(
    Count = n(),
    Mean.Price = mean(price),
    St.Dev.Price = sd(price),
    Median.Price = median(price)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
by_freehold_or_leasehold
## # A tibble: 2 x 5
##   freehold_or_leasehold Count Mean.Price St.Dev.Price Median.Price
##   <fct>                 <int>      <dbl>        <dbl>        <dbl>
## 1 F                      6392    677725.      562651.       520000
## 2 L                      4107    469340.      416106.       375000

# The four bar charts differ only in the statistic on the y axis, so they
# share one base plot and one bar layer.
tenure_type_base <- ggplot(
  by_freehold_or_leasehold,
  aes(x = freehold_or_leasehold, fill = freehold_or_leasehold)
)
tenure_type_bars <- geom_bar(position = "dodge", stat = "identity", width = 0.9)

a <- tenure_type_base + aes(y = Count) + tenure_type_bars
b <- tenure_type_base + aes(y = Mean.Price) + tenure_type_bars
c <- tenure_type_base + aes(y = St.Dev.Price) + tenure_type_bars
d <- tenure_type_base + aes(y = Median.Price) + tenure_type_bars

# Arrange the four charts in a 2 x 2 grid with one shared legend
library(ggpubr)
ggarrange(
  a, b, c, d,
  ncol = 2,
  nrow = 2,
  common.legend = TRUE,
  legend = "bottom"
)

From the plot and the summary table, we can clearly see that there were roughly 56% more freehold purchases (6,392) than leasehold purchases (4,107) in the training set.

# Violin plot of sale price for freehold vs leasehold properties.
# scale_y_log10() puts the prices on a log10 scale. The earlier
# coord_trans(y = "log10") call was removed: coord_flip() replaced it, so it
# had no effect on the plot and only produced the "Coordinate system already
# present" message.
ggplot(train_data, aes(x = freehold_or_leasehold, y = price)) +
  stat_ydensity(trim = FALSE, aes(fill = freehold_or_leasehold)) +
  scale_y_log10(breaks = round(10^seq(3.6, 8, 0.2))) +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The price difference can be clearly seen in the plot above. The leasehold peak is at approximately £400,000, whereas the freehold peak is around £500,000. Therefore, there is clearly an association between the price and whether the property is a leasehold or a freehold. The more expensive leasehold properties may be explained by the fact that most leasehold properties are in Central London.

4 Correlation Plots

# Correlation matrix of the numeric columns (pairwise observations, Pearson).
# NOTE(review): the previous comment said insignificant correlations are left
# blank; nothing in this call requests that -- confirm before relying on it.
library("GGally")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
london_house_prices_2019_training %>%
  select(-ID) %>% # drop the row identifier before correlating
  ggcorr(
    method = c("pairwise", "pearson"),
    layout.exp = 2,
    label_round = 2,
    label = TRUE,
    label_size = 2,
    hjust = 1,
    nbreaks = 5,
    size = 2,
    angle = -20
  )
## Warning in ggcorr(., method = c("pairwise", "pearson"), layout.exp = 2, :
## data in column(s) 'date', 'postcode', 'property_type', 'whether_old_or_new',
## 'freehold_or_leasehold', 'address1', 'address2', 'address3', 'town',
## 'local_aut', 'county', 'postcode_short', 'current_energy_rating',
## 'windows_energy_eff', 'tenure', 'nearest_station', 'water_company', 'district',
## 'type_of_closest_station' are not numeric and were ignored

From the correlation matrix, we can clearly observe that the price has a high positive correlation with the total floor area, number of habitable rooms, current CO2 emissions, potential CO2 emissions, and average income. These are the variables that I need to be careful with when including them in my models because of the problem of multicollinearity. Multicollinearity occurs when one predictor variable in a multiple regression model can be linearly predicted from the others with a high degree of accuracy; this can lead to skewed or misleading results. Fortunately, decision trees and boosted-tree algorithms are far less sensitive to multicollinearity by nature.

5 Regression Analysis

5.1 New Variables

I will begin the regression analysis by creating a few new variables that I feel will be important for price prediction. The variables I create are:

- total_population_per_zone: total population per zone
- average_income_per_zone: average income per zone (stored in the column `average_income_per_zone1`)
- average_distance_to_station_per_zone: average distance to station per zone

library(rlist)
# Number of distinct values in each column of the training partition.
# vapply() guarantees an integer vector, whereas the return type of sapply()
# depends on its input.
a <- vapply(train_data, function(x) length(unique(x)), integer(1))
print(a)
##                           ID                         date 
##                        10499                          258 
##                     postcode                property_type 
##                         9743                            4 
##           whether_old_or_new        freehold_or_leasehold 
##                            2                            2 
##                     address1                     address2 
##                         2290                          364 
##                     address3                         town 
##                         7018                          123 
##                    local_aut                       county 
##                           69                           33 
##               postcode_short        current_energy_rating 
##                          245                            6 
##             total_floor_area       number_habitable_rooms 
##                         1670                           13 
##        co2_emissions_current      co2_emissions_potential 
##                          127                          109 
##   energy_consumption_current energy_consumption_potential 
##                          559                          463 
##           windows_energy_eff                       tenure 
##                            5                            3 
##                     latitude                    longitude 
##                         9529                         9627 
##                   population                     altitude 
##                          260                          162 
##                  london_zone              nearest_station 
##                            7                          581 
##                water_company               average_income 
##                            5                          331 
##                     district                        price 
##                           33                         1724 
##      type_of_closest_station               num_tube_lines 
##                            3                            7 
##               num_rail_lines         num_light_rail_lines 
##                            3                            2 
##          distance_to_station 
##                         2393
# Empty list; it is not used again in this section
zone_counts1 <- list()
# Type checks on the london_zone entries
class(a["london_zone"])
## [1] "integer"
class(train_data[["london_zone"]])
## [1] "integer"

# Zone-level lookup tables, one row per london_zone, computed from the
# training partition.
total_population_per_zone <- train_data %>%
  group_by(london_zone) %>%
  summarise(total_population_per_zone = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)

average_income_per_zone <- train_data %>%
  group_by(london_zone) %>%
  summarise(average_income_per_zone1 = mean(average_income))
## `summarise()` ungrouping output (override with `.groups` argument)

average_distance_to_station_per_zone <- train_data %>%
  group_by(london_zone) %>%
  summarise(average_distance_to_station_per_zone = mean(distance_to_station))
## `summarise()` ungrouping output (override with `.groups` argument)

# Attach the three zone-level columns to every training row.
# NOTE(review): in the code shown here only train_data receives these columns;
# test_data and the out-of-sample data would need the same joins before a
# model that uses them can predict on those sets -- confirm.
train_data <- train_data %>%
  left_join(total_population_per_zone, by = "london_zone") %>%
  left_join(average_income_per_zone, by = "london_zone") %>%
  left_join(average_distance_to_station_per_zone, by = "london_zone")

5.2 First Linear Regression Model

# Cross-validation settings ----
CVfolds <- 15

# Resampling indices are generated once from the training prices and handed to
# trainControl() through `index`.
indexProbs <- createMultiFolds(train_data$price, k = CVfolds, times = 1)

control <- trainControl(
  method = "cv",
  number = CVfolds,
  returnResamp = "final",
  savePredictions = "final",
  index = indexProbs,
  sampling = NULL
)

I build four linear regression models to fit the best prediction line between the explanatory variables and the dependent variable.

# Fit the first linear model; resampling follows the settings in `control`
model1_lm <- train(
  price ~ property_type + freehold_or_leasehold + distance_to_station +
    whether_old_or_new + longitude + altitude + num_tube_lines,
  data = train_data,
  method = "lm",
  trControl = control
)

# Coefficient table and fit statistics
summary(model1_lm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1293507  -206055   -67429    78090  9642077 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            1043881.3    23968.7  43.552  < 2e-16 ***
## property_typeF         -443141.8    41532.2 -10.670  < 2e-16 ***
## property_typeS         -318883.2    21663.8 -14.720  < 2e-16 ***
## property_typeT         -326081.2    21114.6 -15.443  < 2e-16 ***
## freehold_or_leaseholdL -151472.2    36358.8  -4.166 3.12e-05 ***
## distance_to_station    -190490.3    11955.4 -15.933  < 2e-16 ***
## whether_old_or_newY      53165.5   178964.4   0.297    0.766    
## longitude              -391983.2    29858.4 -13.128  < 2e-16 ***
## altitude                  -870.6      188.0  -4.631 3.68e-06 ***
## num_tube_lines          171303.2     6511.9  26.306  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 473000 on 10489 degrees of freedom
## Multiple R-squared:  0.1745, Adjusted R-squared:  0.1737 
## F-statistic: 246.3 on 9 and 10489 DF,  p-value: < 2.2e-16

I analysed the significance of the variables using the p-value (<0.05) to find the model with the lowest value of Root Mean Square Error (RMSE) and the highest R-squared. The greater the R-squared of a model, the better it performs. The first model that we built has an adjusted R-squared of 0.1737 and is a poor linear regression model. Next, we will refine the choice of predictors based on the importance of the variables.

# Variable importance for model 1, with scaling switched on
importance <- varImp(model1_lm, scale = TRUE)
plot(importance)

5.2.1 Predict the values in testing and out of sample data

# Predict prices for the held-out test partition with model 1
predictions <- predict(model1_lm, test_data)

# Test-set RMSE and R-squared collected in a one-row data frame
lr_results <- data.frame(
  RMSE = RMSE(predictions, test_data$price),
  Rsquare = R2(predictions, test_data$price)
)

lr_results
##       RMSE   Rsquare
## 1 467055.2 0.1791207
# Prices for the out-of-sample data are predicted the same way
predictions_oos <- predict(model1_lm, london_house_prices_2019_out_of_sample)

5.3 Second Linear Regression Model

set.seed(69)

# Copy two zone-level features under shorter column names.
# The earlier code read `train_data$average_distance_to_station` and
# `train_data$average_income_per_zone`. Neither column exists; on a data.frame
# `$` silently partial-matched them to `average_distance_to_station_per_zone`
# and `average_income_per_zone1`. The source columns are now named explicitly
# so the result no longer depends on partial matching.
train_data$average_distance_to_station <- train_data$average_distance_to_station_per_zone
train_data$average_income_per_zone <- train_data$average_income_per_zone1
# A third line assigned `unlist(train_data$population_per_zone)`. No column
# name starts with `population_per_zone`, so it assigned NULL and changed
# nothing; it has been dropped.

# Keep only rows with a known price and population.
# NOTE(review): `control` holds resampling indices computed on train_data
# before this filter; if any row is removed here they no longer line up with
# the data -- confirm that no rows are dropped.
train_data <- train_data[!is.na(train_data$price), ]
train_data <- train_data[!is.na(train_data$population), ]

# Candidate terms that were considered but are not in the model below:
# + total_floor_area + number_habitable_rooms + energy_consumption_current + latitude + longitude + average_income + type_of_closest_station + distance_to_station + average_distance_to_station_per_zone*(as.factor(london_zone)) + total_population_per_zone + average_income_per_zone + average_income_per_zone*(as.factor(london_zone))

# Fit the second linear model; rows with missing values are omitted
model2_lm <- train(
  price ~ property_type + number_habitable_rooms + energy_consumption_current +
    longitude + average_income + type_of_closest_station + distance_to_station,
  train_data,
  method = "lm",
  trControl = control,
  na.action = na.omit
)

# total_floor_area*I(as.factor(london_zone))

# Coefficient table and fit statistics
summary(model2_lm)

5.3.1 Predict the values in testing and out of sample data

We measure the performance of our linear regression model with the RMSE and RSquare metrics.

# Predict prices for the held-out test partition with model 2
predictions <- predict(model2_lm, test_data)

# Test-set RMSE and R-squared collected in a one-row data frame
lr_results <- data.frame(
  RMSE = RMSE(predictions, test_data$price),
  Rsquare = R2(predictions, test_data$price)
)

lr_results

# Prices for the out-of-sample data are predicted the same way
predictions_oos <- predict(model2_lm, london_house_prices_2019_out_of_sample)

Next, we will refine the choice of predictors based on the importance of the variables.

# Variable importance for model 2, with scaling switched on
importance <- varImp(model2_lm, scale = TRUE)
plot(importance)

5.4 Third Linear Regression Model

set.seed(69)

# Fit the third linear model; resampling follows the settings in `control`.
# The formula includes total_floor_area crossed with london_zone (as a factor).
model3_lm <- train(
  price ~ property_type + total_floor_area + number_habitable_rooms +
    latitude + longitude + average_income + type_of_closest_station +
    distance_to_station + total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "lm",
  trControl = control
)

# total_floor_area*I(as.factor(london_zone))

# Coefficient table and fit statistics
summary(model3_lm)

5.4.1 Predict the values in testing and out of sample data

# Predict prices for the held-out test partition with model 3
predictions <- predict(model3_lm, test_data)

# Test-set RMSE and R-squared collected in a one-row data frame
lr_results <- data.frame(
  RMSE = RMSE(predictions, test_data$price),
  Rsquare = R2(predictions, test_data$price)
)

lr_results

# Prices for the out-of-sample data are predicted the same way
predictions_oos <- predict(model3_lm, london_house_prices_2019_out_of_sample)

Next, we will refine the choice of predictors based on the importance of the variables.

# Variable importance for model 3, with scaling switched on
importance <- varImp(model3_lm, scale = TRUE)
plot(importance)

5.5 Fourth Linear Regression Model

set.seed(69)

# Fit the fourth linear model; resampling follows the settings in `control`.
# The formula adds the postcode_short and water_company factors and crosses
# total_floor_area with london_zone (as a factor).
model4_lm <- train(
  price ~ distance_to_station + num_tube_lines + property_type + latitude +
    longitude + altitude + postcode_short + water_company +
    total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "lm",
  trControl = control
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
 # Leftover note, not used in the fit: an alternative spelling of the
 # interaction term, total_floor_area*I(as.factor(london_zone))

# Print the coefficient table and fit statistics of the fourth linear model
summary(model4_lm)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3357899   -66014     3805    62228  3455619 
## 
## Coefficients: (3 not defined because of singularities)
##                                              Estimate Std. Error t value
## (Intercept)                                -2.105e+07  1.473e+07  -1.429
## distance_to_station                        -2.781e+04  6.867e+03  -4.050
## num_tube_lines                              3.144e+03  4.813e+03   0.653
## property_typeF                             -2.173e+05  1.154e+04 -18.842
## property_typeS                             -1.208e+05  1.016e+04 -11.892
## property_typeT                             -1.593e+05  1.045e+04 -15.248
## latitude                                    3.897e+05  2.865e+05   1.360
## longitude                                  -8.661e+05  1.799e+05  -4.816
## altitude                                    8.081e+02  1.923e+02   4.203
## postcode_shortBR2                           2.543e+04  2.972e+04   0.856
## postcode_shortBR3                           1.987e+04  3.036e+04   0.654
## postcode_shortBR4                          -1.117e+04  3.932e+04  -0.284
## postcode_shortBR5                           9.055e+03  3.971e+04   0.228
## postcode_shortBR6                           4.943e+04  3.867e+04   1.278
## postcode_shortBR7                           9.255e+04  3.841e+04   2.410
## postcode_shortCR0                          -1.188e+05  3.441e+04  -3.452
## postcode_shortCR2                          -1.327e+05  4.318e+04  -3.074
## postcode_shortCR3                          -1.425e+05  1.300e+05  -1.096
## postcode_shortCR4                          -8.797e+04  4.703e+04  -1.870
## postcode_shortCR5                          -1.976e+05  6.129e+04  -3.224
## postcode_shortCR7                          -1.570e+05  4.457e+04  -3.523
## postcode_shortCR8                          -1.887e+05  5.547e+04  -3.401
## postcode_shortDA1                           9.352e+04  6.796e+04   1.376
## postcode_shortDA14                          5.723e+04  4.399e+04   1.301
## postcode_shortDA15                          5.622e+04  3.558e+04   1.580
## postcode_shortDA16                          8.623e+03  3.855e+04   0.224
## postcode_shortDA17                          1.471e+04  5.497e+04   0.268
## postcode_shortDA18                         -4.240e+04  1.511e+05  -0.281
## postcode_shortDA5                           9.455e+04  4.451e+04   2.125
## postcode_shortDA6                           3.925e+04  5.823e+04   0.674
## postcode_shortDA7                           5.070e+04  4.298e+04   1.180
## postcode_shortDA8                           4.225e+04  5.130e+04   0.824
## postcode_shortE1                            1.596e+05  5.245e+04   3.044
## postcode_shortE10                           6.846e+04  5.768e+04   1.187
## postcode_shortE11                           7.047e+04  5.468e+04   1.289
## postcode_shortE12                           1.360e+04  6.606e+04   0.206
## postcode_shortE13                          -2.946e+04  5.619e+04  -0.524
## postcode_shortE14                           1.366e+05  4.438e+04   3.079
## postcode_shortE15                           5.772e+04  5.300e+04   1.089
## postcode_shortE16                           5.378e+04  5.201e+04   1.034
## postcode_shortE17                           7.967e+04  5.723e+04   1.392
## postcode_shortE18                           4.113e+04  6.395e+04   0.643
## postcode_shortE1W                           2.054e+05  5.830e+04   3.523
## postcode_shortE2                            1.822e+05  5.771e+04   3.157
## postcode_shortE3                            1.180e+05  5.406e+04   2.184
## postcode_shortE4                           -5.235e+04  6.557e+04  -0.798
## postcode_shortE5                            8.498e+04  6.101e+04   1.393
## postcode_shortE6                           -9.944e+03  5.163e+04  -0.193
## postcode_shortE7                            5.066e+04  6.062e+04   0.836
## postcode_shortE8                            2.086e+05  6.219e+04   3.355
## postcode_shortE9                            1.157e+05  5.923e+04   1.954
## postcode_shortEC1M                          4.673e+05  1.541e+05   3.033
## postcode_shortEC1N                          4.975e+05  1.291e+05   3.854
## postcode_shortEC1R                          2.562e+05  1.294e+05   1.979
## postcode_shortEC1V                          4.502e+05  8.235e+04   5.467
## postcode_shortEC1Y                         -1.406e+04  2.121e+05  -0.066
## postcode_shortEC2Y                          7.575e+05  1.288e+05   5.880
## postcode_shortEC4R                          6.083e+05  2.117e+05   2.874
## postcode_shortEC4V                          5.796e+05  2.117e+05   2.738
## postcode_shortEN1                          -1.349e+05  7.653e+04  -1.763
## postcode_shortEN2                          -1.389e+05  8.003e+04  -1.735
## postcode_shortEN3                          -1.583e+05  8.103e+04  -1.954
## postcode_shortEN4                          -1.120e+05  8.867e+04  -1.263
## postcode_shortEN5                          -1.518e+05  8.683e+04  -1.748
## postcode_shortEN8                          -1.311e+05  1.437e+05  -0.912
## postcode_shortHA0                          -2.402e+05  8.351e+04  -2.876
## postcode_shortHA1                          -2.863e+05  8.830e+04  -3.243
## postcode_shortHA2                          -3.018e+05  8.850e+04  -3.411
## postcode_shortHA3                          -2.947e+05  8.681e+04  -3.395
## postcode_shortHA4                          -3.235e+05  9.316e+04  -3.472
## postcode_shortHA5                          -2.747e+05  9.313e+04  -2.949
## postcode_shortHA6                          -3.311e+05  1.044e+05  -3.173
## postcode_shortHA7                          -2.341e+05  9.004e+04  -2.600
## postcode_shortHA8                          -2.585e+05  8.460e+04  -3.056
## postcode_shortHA9                          -2.440e+05  8.324e+04  -2.931
## postcode_shortIG1                          -2.517e+04  7.217e+04  -0.349
## postcode_shortIG11                          1.281e+04  7.338e+04   0.175
## postcode_shortIG2                           1.312e+04  7.679e+04   0.171
## postcode_shortIG3                           1.708e+04  7.900e+04   0.216
## postcode_shortIG4                          -1.676e+04  8.633e+04  -0.194
## postcode_shortIG5                          -7.402e+04  7.359e+04  -1.006
## postcode_shortIG6                          -1.919e+04  6.759e+04  -0.284
## postcode_shortIG7                          -2.305e+02  8.104e+04  -0.003
## postcode_shortIG8                           1.065e+04  6.369e+04   0.167
## postcode_shortIG9                          -2.622e+04  1.345e+05  -0.195
## postcode_shortKT1                          -1.130e+05  7.292e+04  -1.549
## postcode_shortKT2                          -2.808e+03  6.676e+04  -0.042
## postcode_shortKT3                          -1.097e+05  6.261e+04  -1.753
## postcode_shortKT4                          -1.483e+05  6.402e+04  -2.317
## postcode_shortKT5                          -1.372e+05  6.947e+04  -1.975
## postcode_shortKT6                          -1.188e+05  6.785e+04  -1.751
## postcode_shortKT8                           1.722e+05  2.179e+05   0.790
## postcode_shortKT9                          -2.665e+05  7.193e+04  -3.705
## postcode_shortN1                            2.475e+05  5.270e+04   4.696
## postcode_shortN10                           1.209e+05  6.955e+04   1.739
## postcode_shortN11                          -5.059e+04  7.424e+04  -0.681
## postcode_shortN12                          -9.600e+04  7.856e+04  -1.222
## postcode_shortN13                          -1.030e+05  7.357e+04  -1.399
## postcode_shortN14                          -9.699e+04  7.537e+04  -1.287
## postcode_shortN15                          -6.555e+03  6.627e+04  -0.099
## postcode_shortN16                           1.173e+05  5.787e+04   2.028
## postcode_shortN17                          -6.956e+04  6.731e+04  -1.034
## postcode_shortN18                          -1.733e+05  7.807e+04  -2.220
## postcode_shortN19                          -3.966e+04  6.744e+04  -0.588
## postcode_shortN2                            9.117e+04  7.208e+04   1.265
## postcode_shortN20                          -5.159e+03  8.341e+04  -0.062
## postcode_shortN21                          -5.792e+04  7.598e+04  -0.762
## postcode_shortN22                          -1.135e+04  7.524e+04  -0.151
## postcode_shortN3                           -4.506e+04  7.847e+04  -0.574
## postcode_shortN4                            8.272e+04  6.072e+04   1.362
## postcode_shortN5                            1.905e+05  6.115e+04   3.116
## postcode_shortN6                            4.323e+05  6.920e+04   6.247
## postcode_shortN7                            9.693e+04  6.196e+04   1.564
## postcode_shortN8                            1.112e+05  6.436e+04   1.728
## postcode_shortN9                           -1.398e+05  7.236e+04  -1.933
## postcode_shortNW1                           2.232e+05  6.049e+04   3.689
## postcode_shortNW10                         -1.015e+05  6.913e+04  -1.469
## postcode_shortNW11                         -4.075e+02  7.282e+04  -0.006
## postcode_shortNW2                          -1.291e+05  6.833e+04  -1.889
## postcode_shortNW3                           4.183e+05  6.412e+04   6.523
## postcode_shortNW4                          -2.348e+05  7.805e+04  -3.009
## postcode_shortNW5                           1.263e+05  6.515e+04   1.939
## postcode_shortNW6                           1.691e+05  6.371e+04   2.654
## postcode_shortNW7                          -1.946e+05  8.148e+04  -2.388
## postcode_shortNW8                           6.047e+05  6.533e+04   9.257
## postcode_shortNW9                          -1.975e+05  8.018e+04  -2.463
## postcode_shortRM1                           6.004e+04  8.851e+04   0.678
## postcode_shortRM10                          4.943e+04  8.005e+04   0.618
## postcode_shortRM11                          9.725e+04  8.519e+04   1.142
## postcode_shortRM12                          9.154e+04  8.230e+04   1.112
## postcode_shortRM13                          6.715e+04  7.784e+04   0.863
## postcode_shortRM14                          2.138e+05  8.891e+04   2.404
## postcode_shortRM2                           1.281e+05  9.224e+04   1.388
## postcode_shortRM3                           6.999e+04  9.127e+04   0.767
## postcode_shortRM5                           4.844e+04  8.990e+04   0.539
## postcode_shortRM6                           2.274e+04  8.350e+04   0.272
## postcode_shortRM7                           6.738e+04  8.290e+04   0.813
## postcode_shortRM8                           3.930e+04  7.792e+04   0.504
## postcode_shortRM9                           2.689e+04  7.696e+04   0.349
## postcode_shortSE1                           2.634e+05  4.969e+04   5.301
## postcode_shortSE10                          2.207e+05  4.527e+04   4.876
## postcode_shortSE11                          2.234e+05  7.107e+04   3.144
## postcode_shortSE12                          1.572e+04  3.640e+04   0.432
## postcode_shortSE13                          8.580e+04  3.894e+04   2.203
## postcode_shortSE14                          8.911e+04  5.585e+04   1.595
## postcode_shortSE15                          9.885e+04  4.199e+04   2.354
## postcode_shortSE16                          1.463e+05  4.929e+04   2.967
## postcode_shortSE17                          1.683e+05  5.788e+04   2.908
## postcode_shortSE18                          1.342e+04  3.981e+04   0.337
## postcode_shortSE19                         -3.260e+04  4.058e+04  -0.803
## postcode_shortSE2                           5.271e+04  5.557e+04   0.949
## postcode_shortSE20                         -3.134e+03  3.859e+04  -0.081
## postcode_shortSE21                          1.844e+05  4.984e+04   3.699
## postcode_shortSE22                          1.443e+05  4.323e+04   3.339
## postcode_shortSE23                          3.605e+04  3.563e+04   1.012
## postcode_shortSE24                          8.600e+04  5.404e+04   1.591
## postcode_shortSE25                         -8.505e+04  3.793e+04  -2.242
## postcode_shortSE26                         -9.359e+03  3.677e+04  -0.255
## postcode_shortSE27                         -4.129e+04  4.555e+04  -0.906
## postcode_shortSE28                          5.557e+04  5.619e+04   0.989
## postcode_shortSE3                           1.794e+05  4.151e+04   4.320
## postcode_shortSE4                           7.426e+04  4.466e+04   1.663
## postcode_shortSE5                           5.158e+04  4.648e+04   1.110
## postcode_shortSE6                          -3.316e+04  3.392e+04  -0.977
## postcode_shortSE7                           4.311e+04  4.621e+04   0.933
## postcode_shortSE8                           9.095e+04  5.521e+04   1.647
## postcode_shortSE9                          -2.990e+03  2.952e+04  -0.101
## postcode_shortSM1                          -1.731e+05  5.740e+04  -3.016
## postcode_shortSM2                          -1.579e+05  6.223e+04  -2.538
## postcode_shortSM3                          -1.472e+05  6.694e+04  -2.198
## postcode_shortSM4                          -1.345e+05  5.818e+04  -2.312
## postcode_shortSM5                          -1.325e+05  5.573e+04  -2.377
## postcode_shortSM6                          -1.474e+05  5.419e+04  -2.720
## postcode_shortSW10                          6.564e+05  6.704e+04   9.792
## postcode_shortSW11                          1.643e+05  4.911e+04   3.346
## postcode_shortSW12                          1.525e+05  4.840e+04   3.151
## postcode_shortSW13                          4.582e+05  6.764e+04   6.774
## postcode_shortSW14                          1.030e+05  6.693e+04   1.539
## postcode_shortSW15                          1.696e+04  5.837e+04   0.291
## postcode_shortSW16                         -2.979e+04  4.082e+04  -0.730
## postcode_shortSW17                          7.935e+04  4.539e+04   1.748
## postcode_shortSW18                          1.330e+05  4.888e+04   2.721
## postcode_shortSW19                          1.099e+05  4.920e+04   2.233
## postcode_shortSW1E                          7.225e+05  2.121e+05   3.406
## postcode_shortSW1H                          2.452e+05  1.289e+05   1.902
## postcode_shortSW1P                          1.446e+05  8.431e+04   1.715
## postcode_shortSW1V                          3.896e+05  6.506e+04   5.989
## postcode_shortSW1W                          1.034e+06  7.881e+04  13.118
## postcode_shortSW1X                          1.599e+06  8.144e+04  19.630
## postcode_shortSW1Y                          6.517e+05  2.124e+05   3.069
## postcode_shortSW2                           2.330e+04  4.242e+04   0.549
## postcode_shortSW20                          2.311e+04  5.571e+04   0.415
## postcode_shortSW3                           1.081e+06  6.170e+04  17.513
## postcode_shortSW4                           1.624e+05  4.992e+04   3.253
## postcode_shortSW5                           5.232e+05  8.195e+04   6.384
## postcode_shortSW6                           3.165e+05  5.537e+04   5.716
## postcode_shortSW7                           6.018e+05  6.708e+04   8.971
## postcode_shortSW8                           2.456e+05  5.851e+04   4.197
## postcode_shortSW9                           1.423e+05  5.890e+04   2.416
## postcode_shortTN16                         -5.899e+05  2.145e+05  -2.750
## postcode_shortTW1                           5.864e+04  7.232e+04   0.811
## postcode_shortTW10                          2.588e+05  6.984e+04   3.705
## postcode_shortTW11                         -6.732e+03  7.257e+04  -0.093
## postcode_shortTW12                         -2.551e+05  8.307e+04  -3.071
## postcode_shortTW13                         -3.186e+05  8.689e+04  -3.666
## postcode_shortTW14                         -3.225e+05  9.311e+04  -3.463
## postcode_shortTW15                         -3.576e+05  2.241e+05  -1.596
## postcode_shortTW2                          -1.680e+05  7.591e+04  -2.213
## postcode_shortTW3                          -3.631e+05  8.304e+04  -4.372
## postcode_shortTW4                          -3.531e+05  9.206e+04  -3.835
## postcode_shortTW5                          -3.363e+05  9.086e+04  -3.701
## postcode_shortTW7                          -2.297e+05  7.444e+04  -3.085
## postcode_shortTW8                          -1.814e+05  7.634e+04  -2.376
## postcode_shortTW9                           2.951e+04  6.848e+04   0.431
## postcode_shortUB1                          -2.981e+05  9.676e+04  -3.081
## postcode_shortUB10                         -3.874e+05  1.006e+05  -3.852
## postcode_shortUB2                          -3.368e+05  9.116e+04  -3.695
## postcode_shortUB3                          -3.721e+05  9.390e+04  -3.963
## postcode_shortUB4                          -3.347e+05  9.216e+04  -3.632
## postcode_shortUB5                          -3.225e+05  8.902e+04  -3.623
## postcode_shortUB6                          -2.649e+05  8.807e+04  -3.008
## postcode_shortUB7                          -4.079e+05  1.006e+05  -4.057
## postcode_shortUB8                          -4.089e+05  1.004e+05  -4.074
## postcode_shortW10                           1.881e+05  7.476e+04   2.516
## postcode_shortW11                           1.085e+06  7.071e+04  15.339
## postcode_shortW12                           1.094e+04  6.881e+04   0.159
## postcode_shortW13                          -1.054e+05  7.522e+04  -1.401
## postcode_shortW14                           1.314e+05  6.530e+04   2.013
## postcode_shortW1B                           4.983e+05  2.146e+05   2.322
## postcode_shortW1F                           7.614e+05  2.128e+05   3.578
## postcode_shortW1G                           9.883e+05  1.303e+05   7.583
## postcode_shortW1H                           3.420e+05  9.074e+04   3.769
## postcode_shortW1J                           1.096e+06  2.125e+05   5.158
## postcode_shortW1K                           1.227e+06  1.155e+05  10.630
## postcode_shortW1T                           6.834e+05  2.127e+05   3.214
## postcode_shortW1U                           4.896e+05  9.623e+04   5.088
## postcode_shortW1W                           3.663e+05  1.158e+05   3.164
## postcode_shortW2                            3.928e+05  6.205e+04   6.331
## postcode_shortW3                           -1.043e+05  6.870e+04  -1.518
## postcode_shortW4                            1.878e+05  6.490e+04   2.894
## postcode_shortW5                           -7.768e+04  7.101e+04  -1.094
## postcode_shortW6                            1.608e+05  6.559e+04   2.452
## postcode_shortW7                           -2.008e+05  7.894e+04  -2.544
## postcode_shortW8                            6.596e+05  6.906e+04   9.552
## postcode_shortW9                            2.506e+05  7.149e+04   3.505
## postcode_shortWC1A                                 NA         NA      NA
## postcode_shortWC1B                          8.060e+05  2.127e+05   3.789
## postcode_shortWC1E                                 NA         NA      NA
## postcode_shortWC1H                          5.591e+05  1.301e+05   4.299
## postcode_shortWC1N                          3.429e+05  2.124e+05   1.615
## postcode_shortWC1X                          4.478e+05  1.161e+05   3.856
## postcode_shortWC2B                          7.300e+05  2.123e+05   3.438
## postcode_shortWC2E                          4.102e+05  1.541e+05   2.663
## postcode_shortWC2H                          5.769e+05  2.123e+05   2.717
## postcode_shortWC2N                          2.976e+05  2.121e+05   1.403
## postcode_shortCR6                                  NA         NA      NA
## `water_companyEssex & Suffolk Water`        4.029e+04  5.627e+04   0.716
## `water_companyLeep Utilities`               7.843e+04  1.494e+05   0.525
## `water_companySES Water`                    4.832e+04  3.782e+04   1.278
## `water_companyThames Water`                 5.859e+04  2.632e+04   2.226
## total_floor_area                            1.979e+04  1.827e+02 108.370
## `as.factor(london_zone)2`                   7.472e+05  2.646e+04  28.243
## `as.factor(london_zone)3`                   9.620e+05  3.026e+04  31.786
## `as.factor(london_zone)4`                   1.176e+06  3.288e+04  35.754
## `as.factor(london_zone)5`                   1.206e+06  3.467e+04  34.772
## `as.factor(london_zone)6`                   1.241e+06  3.693e+04  33.593
## `as.factor(london_zone)7`                   1.451e+06  7.619e+05   1.904
## `total_floor_area:as.factor(london_zone)2` -1.025e+04  2.080e+02 -49.288
## `total_floor_area:as.factor(london_zone)3` -1.335e+04  2.021e+02 -66.058
## `total_floor_area:as.factor(london_zone)4` -1.579e+04  2.157e+02 -73.178
## `total_floor_area:as.factor(london_zone)5` -1.612e+04  2.265e+02 -71.154
## `total_floor_area:as.factor(london_zone)6` -1.650e+04  2.306e+02 -71.549
## `total_floor_area:as.factor(london_zone)7` -1.888e+04  7.610e+03  -2.481
##                                            Pr(>|t|)    
## (Intercept)                                0.153046    
## distance_to_station                        5.16e-05 ***
## num_tube_lines                             0.513579    
## property_typeF                              < 2e-16 ***
## property_typeS                              < 2e-16 ***
## property_typeT                              < 2e-16 ***
## latitude                                   0.173820    
## longitude                                  1.49e-06 ***
## altitude                                   2.65e-05 ***
## postcode_shortBR2                          0.392174    
## postcode_shortBR3                          0.512828    
## postcode_shortBR4                          0.776298    
## postcode_shortBR5                          0.819613    
## postcode_shortBR6                          0.201211    
## postcode_shortBR7                          0.015979 *  
## postcode_shortCR0                          0.000558 ***
## postcode_shortCR2                          0.002120 ** 
## postcode_shortCR3                          0.273201    
## postcode_shortCR4                          0.061462 .  
## postcode_shortCR5                          0.001269 ** 
## postcode_shortCR7                          0.000428 ***
## postcode_shortCR8                          0.000674 ***
## postcode_shortDA1                          0.168819    
## postcode_shortDA14                         0.193279    
## postcode_shortDA15                         0.114105    
## postcode_shortDA16                         0.823004    
## postcode_shortDA17                         0.789003    
## postcode_shortDA18                         0.779089    
## postcode_shortDA5                          0.033648 *  
## postcode_shortDA6                          0.500303    
## postcode_shortDA7                          0.238117    
## postcode_shortDA8                          0.410216    
## postcode_shortE1                           0.002344 ** 
## postcode_shortE10                          0.235311    
## postcode_shortE11                          0.197449    
## postcode_shortE12                          0.836892    
## postcode_shortE13                          0.600050    
## postcode_shortE14                          0.002086 ** 
## postcode_shortE15                          0.276203    
## postcode_shortE16                          0.301144    
## postcode_shortE17                          0.163946    
## postcode_shortE18                          0.520112    
## postcode_shortE1W                          0.000428 ***
## postcode_shortE2                           0.001599 ** 
## postcode_shortE3                           0.029013 *  
## postcode_shortE4                           0.424660    
## postcode_shortE5                           0.163730    
## postcode_shortE6                           0.847289    
## postcode_shortE7                           0.403345    
## postcode_shortE8                           0.000797 ***
## postcode_shortE9                           0.050753 .  
## postcode_shortEC1M                         0.002427 ** 
## postcode_shortEC1N                         0.000117 ***
## postcode_shortEC1R                         0.047802 *  
## postcode_shortEC1V                         4.70e-08 ***
## postcode_shortEC1Y                         0.947141    
## postcode_shortEC2Y                         4.23e-09 ***
## postcode_shortEC4R                         0.004061 ** 
## postcode_shortEC4V                         0.006199 ** 
## postcode_shortEN1                          0.077938 .  
## postcode_shortEN2                          0.082734 .  
## postcode_shortEN3                          0.050742 .  
## postcode_shortEN4                          0.206495    
## postcode_shortEN5                          0.080525 .  
## postcode_shortEN8                          0.361633    
## postcode_shortHA0                          0.004035 ** 
## postcode_shortHA1                          0.001189 ** 
## postcode_shortHA2                          0.000650 ***
## postcode_shortHA3                          0.000690 ***
## postcode_shortHA4                          0.000518 ***
## postcode_shortHA5                          0.003194 ** 
## postcode_shortHA6                          0.001512 ** 
## postcode_shortHA7                          0.009333 ** 
## postcode_shortHA8                          0.002251 ** 
## postcode_shortHA9                          0.003381 ** 
## postcode_shortIG1                          0.727283    
## postcode_shortIG11                         0.861450    
## postcode_shortIG2                          0.864367    
## postcode_shortIG3                          0.828860    
## postcode_shortIG4                          0.846065    
## postcode_shortIG5                          0.314462    
## postcode_shortIG6                          0.776424    
## postcode_shortIG7                          0.997730    
## postcode_shortIG8                          0.867174    
## postcode_shortIG9                          0.845451    
## postcode_shortKT1                          0.121385    
## postcode_shortKT2                          0.966454    
## postcode_shortKT3                          0.079666 .  
## postcode_shortKT4                          0.020520 *  
## postcode_shortKT5                          0.048316 *  
## postcode_shortKT6                          0.080012 .  
## postcode_shortKT8                          0.429289    
## postcode_shortKT9                          0.000213 ***
## postcode_shortN1                           2.69e-06 ***
## postcode_shortN10                          0.082112 .  
## postcode_shortN11                          0.495610    
## postcode_shortN12                          0.221783    
## postcode_shortN13                          0.161708    
## postcode_shortN14                          0.198151    
## postcode_shortN15                          0.921206    
## postcode_shortN16                          0.042620 *  
## postcode_shortN17                          0.301382    
## postcode_shortN18                          0.026424 *  
## postcode_shortN19                          0.556475    
## postcode_shortN2                           0.205974    
## postcode_shortN20                          0.950679    
## postcode_shortN21                          0.445841    
## postcode_shortN22                          0.880093    
## postcode_shortN3                           0.565812    
## postcode_shortN4                           0.173107    
## postcode_shortN5                           0.001839 ** 
## postcode_shortN6                           4.35e-10 ***
## postcode_shortN7                           0.117762    
## postcode_shortN8                           0.084049 .  
## postcode_shortN9                           0.053318 .  
## postcode_shortNW1                          0.000226 ***
## postcode_shortNW10                         0.141999    
## postcode_shortNW11                         0.995535    
## postcode_shortNW2                          0.058898 .  
## postcode_shortNW3                          7.21e-11 ***
## postcode_shortNW4                          0.002630 ** 
## postcode_shortNW5                          0.052540 .  
## postcode_shortNW6                          0.007960 ** 
## postcode_shortNW7                          0.016965 *  
## postcode_shortNW8                           < 2e-16 ***
## postcode_shortNW9                          0.013805 *  
## postcode_shortRM1                          0.497586    
## postcode_shortRM10                         0.536884    
## postcode_shortRM11                         0.253679    
## postcode_shortRM12                         0.266036    
## postcode_shortRM13                         0.388329    
## postcode_shortRM14                         0.016223 *  
## postcode_shortRM2                          0.165069    
## postcode_shortRM3                          0.443171    
## postcode_shortRM5                          0.590020    
## postcode_shortRM6                          0.785357    
## postcode_shortRM7                          0.416374    
## postcode_shortRM8                          0.614020    
## postcode_shortRM9                          0.726757    
## postcode_shortSE1                          1.17e-07 ***
## postcode_shortSE10                         1.10e-06 ***
## postcode_shortSE11                         0.001671 ** 
## postcode_shortSE12                         0.665919    
## postcode_shortSE13                         0.027607 *  
## postcode_shortSE14                         0.110634    
## postcode_shortSE15                         0.018591 *  
## postcode_shortSE16                         0.003012 ** 
## postcode_shortSE17                         0.003644 ** 
## postcode_shortSE18                         0.736061    
## postcode_shortSE19                         0.421748    
## postcode_shortSE2                          0.342871    
## postcode_shortSE20                         0.935282    
## postcode_shortSE21                         0.000218 ***
## postcode_shortSE22                         0.000845 ***
## postcode_shortSE23                         0.311738    
## postcode_shortSE24                         0.111589    
## postcode_shortSE25                         0.024984 *  
## postcode_shortSE26                         0.799070    
## postcode_shortSE27                         0.364699    
## postcode_shortSE28                         0.322704    
## postcode_shortSE3                          1.57e-05 ***
## postcode_shortSE4                          0.096400 .  
## postcode_shortSE5                          0.267162    
## postcode_shortSE6                          0.328402    
## postcode_shortSE7                          0.350851    
## postcode_shortSE8                          0.099522 .  
## postcode_shortSE9                          0.919331    
## postcode_shortSM1                          0.002567 ** 
## postcode_shortSM2                          0.011162 *  
## postcode_shortSM3                          0.027945 *  
## postcode_shortSM4                          0.020801 *  
## postcode_shortSM5                          0.017478 *  
## postcode_shortSM6                          0.006534 ** 
## postcode_shortSW10                          < 2e-16 ***
## postcode_shortSW11                         0.000822 ***
## postcode_shortSW12                         0.001629 ** 
## postcode_shortSW13                         1.33e-11 ***
## postcode_shortSW14                         0.123838    
## postcode_shortSW15                         0.771337    
## postcode_shortSW16                         0.465513    
## postcode_shortSW17                         0.080443 .  
## postcode_shortSW18                         0.006510 ** 
## postcode_shortSW19                         0.025546 *  
## postcode_shortSW1E                         0.000661 ***
## postcode_shortSW1H                         0.057215 .  
## postcode_shortSW1P                         0.086352 .  
## postcode_shortSW1V                         2.19e-09 ***
## postcode_shortSW1W                          < 2e-16 ***
## postcode_shortSW1X                          < 2e-16 ***
## postcode_shortSW1Y                         0.002154 ** 
## postcode_shortSW2                          0.582924    
## postcode_shortSW20                         0.678263    
## postcode_shortSW3                           < 2e-16 ***
## postcode_shortSW4                          0.001146 ** 
## postcode_shortSW5                          1.80e-10 ***
## postcode_shortSW6                          1.12e-08 ***
## postcode_shortSW7                           < 2e-16 ***
## postcode_shortSW8                          2.73e-05 ***
## postcode_shortSW9                          0.015728 *  
## postcode_shortTN16                         0.005962 ** 
## postcode_shortTW1                          0.417453    
## postcode_shortTW10                         0.000213 ***
## postcode_shortTW11                         0.926087    
## postcode_shortTW12                         0.002140 ** 
## postcode_shortTW13                         0.000247 ***
## postcode_shortTW14                         0.000536 ***
## postcode_shortTW15                         0.110517    
## postcode_shortTW2                          0.026916 *  
## postcode_shortTW3                          1.24e-05 ***
## postcode_shortTW4                          0.000126 ***
## postcode_shortTW5                          0.000216 ***
## postcode_shortTW7                          0.002039 ** 
## postcode_shortTW8                          0.017510 *  
## postcode_shortTW9                          0.666512    
## postcode_shortUB1                          0.002070 ** 
## postcode_shortUB10                         0.000118 ***
## postcode_shortUB2                          0.000221 ***
## postcode_shortUB3                          7.45e-05 ***
## postcode_shortUB4                          0.000283 ***
## postcode_shortUB5                          0.000292 ***
## postcode_shortUB6                          0.002640 ** 
## postcode_shortUB7                          5.01e-05 ***
## postcode_shortUB8                          4.65e-05 ***
## postcode_shortW10                          0.011880 *  
## postcode_shortW11                           < 2e-16 ***
## postcode_shortW12                          0.873677    
## postcode_shortW13                          0.161324    
## postcode_shortW14                          0.044167 *  
## postcode_shortW1B                          0.020252 *  
## postcode_shortW1F                          0.000348 ***
## postcode_shortW1G                          3.68e-14 ***
## postcode_shortW1H                          0.000165 ***
## postcode_shortW1J                          2.55e-07 ***
## postcode_shortW1K                           < 2e-16 ***
## postcode_shortW1T                          0.001315 ** 
## postcode_shortW1U                          3.68e-07 ***
## postcode_shortW1W                          0.001563 ** 
## postcode_shortW2                           2.55e-10 ***
## postcode_shortW3                           0.128995    
## postcode_shortW4                           0.003814 ** 
## postcode_shortW5                           0.274005    
## postcode_shortW6                           0.014231 *  
## postcode_shortW7                           0.010986 *  
## postcode_shortW8                            < 2e-16 ***
## postcode_shortW9                           0.000459 ***
## postcode_shortWC1A                               NA    
## postcode_shortWC1B                         0.000152 ***
## postcode_shortWC1E                               NA    
## postcode_shortWC1H                         1.74e-05 ***
## postcode_shortWC1N                         0.106434    
## postcode_shortWC1X                         0.000116 ***
## postcode_shortWC2B                         0.000588 ***
## postcode_shortWC2E                         0.007766 ** 
## postcode_shortWC2H                         0.006592 ** 
## postcode_shortWC2N                         0.160637    
## postcode_shortCR6                                NA    
## `water_companyEssex & Suffolk Water`       0.474008    
## `water_companyLeep Utilities`              0.599760    
## `water_companySES Water`                   0.201412    
## `water_companyThames Water`                0.026065 *  
## total_floor_area                            < 2e-16 ***
## `as.factor(london_zone)2`                   < 2e-16 ***
## `as.factor(london_zone)3`                   < 2e-16 ***
## `as.factor(london_zone)4`                   < 2e-16 ***
## `as.factor(london_zone)5`                   < 2e-16 ***
## `as.factor(london_zone)6`                   < 2e-16 ***
## `as.factor(london_zone)7`                  0.056942 .  
## `total_floor_area:as.factor(london_zone)2`  < 2e-16 ***
## `total_floor_area:as.factor(london_zone)3`  < 2e-16 ***
## `total_floor_area:as.factor(london_zone)4`  < 2e-16 ***
## `total_floor_area:as.factor(london_zone)5`  < 2e-16 ***
## `total_floor_area:as.factor(london_zone)6`  < 2e-16 ***
## `total_floor_area:as.factor(london_zone)7` 0.013132 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 206200 on 10229 degrees of freedom
## Multiple R-squared:  0.8469, Adjusted R-squared:  0.8429 
## F-statistic: 210.4 on 269 and 10229 DF,  p-value: < 2.2e-16

5.5.1 Predict the values in testing and out of sample data

# Score the held-out test set with the fourth linear model
set.seed(69)
predictions <- predict(model4_lm, newdata = test_data)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# Collect the test-set RMSE and R-squared of the linear model in one table
lr_results <- data.frame(
  RMSE = RMSE(predictions, test_data[["price"]]),
  Rsquare = R2(predictions, test_data[["price"]])
)

lr_results
##       RMSE   Rsquare
## 1 217841.1 0.8258659
# Score the out-of-sample listings with the same linear model
predictions_oos <- predict(
  model4_lm,
  newdata = london_house_prices_2019_out_of_sample
)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

The fourth linear regression has the best performance out of the four: it has an adjusted R-squared of 0.84 on the training set and an R-squared of 0.83 on the testing set. Now let us take a look at the importance of the final variables.

# Compute the scaled variable importance of the fourth linear model, then
# plot it
importance <- varImp(model4_lm, scale = TRUE)
plot(importance)

The number of habitable rooms, average income, and type of closest station are the three most important factors from our linear regression.

5.6 Fit a tree model

Next, I build a decision tree model for our training dataset in order to compare its performance to that of the linear regression model.

# Baseline regression tree on location and property attributes.
# Seed the RNG first so that the resampling performed by train() is
# reproducible from run to run, as is done for the later tuned tree fits.
set.seed(69)
model1_tree <- train(
  price ~ distance_to_station + water_company + property_type +
    whether_old_or_new + latitude + longitude,
  data = train_data,
  method = "rpart",
  trControl = control,
  tuneLength = 10
)

# Resampled performance for each candidate tuning value
model1_tree$results

# Plot the final tree
rpart.plot(model1_tree$finalModel)

# Visualise the variable importance
importance <- varImp(model1_tree, scale = TRUE)
plot(importance)

# Predict prices on the testing data (a regression tree returns predicted
# values, not class probabilities)
predictions_tree <- predict(model1_tree, test_data)

tree_results <- data.frame(
  RMSE = RMSE(predictions_tree, test_data$price),
  Rsquare = R2(predictions_tree, test_data$price)
)
tree_results

I use an initial set of parameters to build a model with a tune length of 10. The adjusted R-square starts off at 46.16%.

# Second regression tree with a richer predictor set, including the
# floor-area by zone interaction.
# Seed the RNG first so that the resampling performed by train() is
# reproducible from run to run, as is done for the later tuned tree fits.
set.seed(69)
model2_tree <- train(
  price ~ distance_to_station + num_tube_lines + property_type + latitude +
    longitude + altitude + postcode_short + water_company +
    total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "rpart",
  trControl = control,
  tuneLength = 10
)

# Resampled performance for each candidate tuning value
model2_tree$results

# Plot the final tree
rpart.plot(model2_tree$finalModel)

# Visualise the variable importance
importance <- varImp(model2_tree, scale = TRUE)
plot(importance)

# Predict prices on the testing data (a regression tree returns predicted
# values, not class probabilities)
predictions_tree <- predict(model2_tree, test_data)

tree_results <- data.frame(
  RMSE = RMSE(predictions_tree, test_data$price),
  Rsquare = R2(predictions_tree, test_data$price)
)
tree_results
# Third regression tree: tune the complexity parameter (cp) over a small
# explicit grid and keep the value with the highest resampled R-squared.
set.seed(69)
custom_grid <- expand.grid(cp = seq(0.00005, 0.00015, 0.00005))
model3_tree <- train(
  price ~ total_floor_area + average_income + longitude + latitude +
    property_type + london_zone + distance_to_station + postcode_short +
    district + energy_consumption_current +
    total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "rpart",
  metric = "Rsquared",
  trControl = control,
  tuneGrid = custom_grid
)

print(model3_tree)
plot(model3_tree)
rpart.plot(model3_tree$finalModel)

# Predict prices on the testing data through the train object, not through
# model3_tree$finalModel: the inner rpart fit was built on the expanded design
# matrix that train() derives from the formula, so it cannot be applied to the
# raw test_data columns directly.
predictions_tree <- predict(model3_tree, test_data)

tree_results <- data.frame(
  RMSE = RMSE(predictions_tree, test_data$price),
  Rsquare = R2(predictions_tree, test_data$price)
)
tree_results

I then use pruning to tune the hyperparameters to make the most efficient model by selecting appropriate values of the complexity parameter. The complexity parameter determines the size of the decision tree and aids in choosing the optimal size of the tree. The expand.grid function now defines the range of complexity-parameter values to try, and the value that gives the best R-squared is selected.

# Fourth regression tree: search a three-value cp grid and select the value
# with the largest resampled R-squared
set.seed(69)
custom_grid <- expand.grid(
  cp = seq(from = 0.00005, to = 0.00015, by = 0.00005)
)
model4_tree <- train(
  price ~ total_floor_area + average_income + longitude + latitude +
    current_energy_rating + num_tube_lines + type_of_closest_station +
    property_type + london_zone + distance_to_station + postcode_short +
    district + energy_consumption_current,
  data = train_data,
  method = "rpart",
  metric = "Rsquared",
  trControl = control,
  tuneGrid = custom_grid
)

print(model4_tree)
## CART 
## 
## 10499 samples
##    13 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (15 fold) 
## Summary of sample sizes: 9798, 9798, 9799, 9799, 9799, 9800, ... 
## Resampling results across tuning parameters:
## 
##   cp       RMSE      Rsquared   MAE     
##   0.00005  224271.7  0.8116812  115348.9
##   0.00010  225054.7  0.8102269  117334.4
##   0.00015  225939.6  0.8087458  118797.4
## 
## Rsquared was used to select the optimal model using the largest value.
## The final value used for the model was cp = 5e-05.
# Plot the tuning results stored in the train object
plot(model4_tree)

# Draw the tree that train() kept as the final model
rpart.plot(model4_tree$finalModel)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predict test-set prices with the tuned tree and tabulate RMSE and R-squared
predictions_tree <- predict(model4_tree, newdata = test_data)
tree_results <- data.frame(
  RMSE = RMSE(predictions_tree, test_data[["price"]]),
  Rsquare = R2(predictions_tree, test_data[["price"]])
)
tree_results
##       RMSE   Rsquare
## 1 273477.4 0.7491902

The decision tree yields a cross-validated R-squared of 81.17% on the training set and an R-squared of 74.92% on the testing set.

The linear regression model that I built performs better because there is a large number of features in the dataset and low noise. Decision trees are also better suited to predicting categorical dependent variables, whereas their performance is compromised when predicting continuous dependent variables such as price.

5.7 Gradient Boosting

Next, I build a model using the Gradient Boosting Machine by adding new variables. In order to ensure the best performance of this model, I added features which were significantly increasing R-squared values in the linear regression and decision tree models. There exist three key elements to gradient boosting: optimising a loss function, building weak learners to make predictions, and finally building an additive model that combines all the weak learners to increase prediction power. In order to optimise the performance of my GBM, I again tune the hyperparameters to make the most efficient model. I tuned the n.trees, interaction.depth, and shrinkage parameters specifically, and the grid below holds them at the best-fit values.

# Seed the RNG ahead of the GBM fit that follows in this chunk
set.seed(69)
# List the tuning parameters caret exposes for the "gbm" method
modelLookup("gbm")
##   model         parameter                   label forReg forClass probModel
## 1   gbm           n.trees   # Boosting Iterations   TRUE     TRUE      TRUE
## 2   gbm interaction.depth          Max Tree Depth   TRUE     TRUE      TRUE
## 3   gbm         shrinkage               Shrinkage   TRUE     TRUE      TRUE
## 4   gbm    n.minobsinnode Min. Terminal Node Size   TRUE     TRUE      TRUE
# Single-row tuning grid: each GBM hyperparameter is fixed at one value
custom_grid <- expand.grid(
  interaction.depth = 9,
  n.trees = 500,
  shrinkage = 0.075,
  n.minobsinnode = 5
)

# Fit the boosted model, evaluated by RMSE under the shared resampling setup
model1_gbm <- train(
  price ~ latitude + longitude + altitude + distance_to_station +
    water_company + property_type + postcode_short +
    total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "gbm",
  trControl = control,
  tuneGrid = custom_grid,
  metric = "RMSE",
  verbose = FALSE
)
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 58: postcode_shortEC1Y has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 231: postcode_shortW1B has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 252: postcode_shortWC1N has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 254: postcode_shortWC2B has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 232: postcode_shortW1F has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 249: postcode_shortWC1B has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 202: postcode_shortTN16 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 61: postcode_shortEC4V has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 209: postcode_shortTW15 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 257: postcode_shortWC2N has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 235: postcode_shortW1J has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 94: postcode_shortKT8 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 192: postcode_shortSW1Y has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 237: postcode_shortW1T has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 256: postcode_shortWC2H has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 60: postcode_shortEC4R has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 186: postcode_shortSW1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 248: postcode_shortWC1A has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 250: postcode_shortWC1E has no variation.
## Warning in (function (x, y, offset = NULL, misc = NULL, distribution =
## "bernoulli", : variable 258: postcode_shortCR6 has no variation.
# Show the resampling summary of the fitted GBM
print(model1_gbm)
## Stochastic Gradient Boosting 
## 
## 10499 samples
##     9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (15 fold) 
## Summary of sample sizes: 9798, 9798, 9799, 9799, 9799, 9800, ... 
## Resampling results:
## 
##   RMSE    Rsquared  MAE     
##   202546  0.849181  101379.9
## 
## Tuning parameter 'n.trees' was held constant at a value of 500
## Tuning
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.075
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 5
# Evaluate the boosted model on the held-out test set
set.seed(69)
predictions <- predict(model1_gbm, newdata = test_data)
gbm_results <- data.frame(
  RMSE = RMSE(predictions, test_data[["price"]]),
  Rsquare = R2(predictions, test_data[["price"]])
)
gbm_results
##       RMSE   Rsquare
## 1 208298.9 0.8368225

Through Gradient Boosting, I create a Machine Learning model with a cross-validated R-squared of 84.92% on the training dataset and an R-squared of 83.68% on the testing dataset.

5.8 Least Absolute Shrinkage and Selection Operator

Next, I build a model using LASSO Regression to find the relationship between the price and the independent variables. Lasso Regression is an L1 regularisation method that uses the concept of penalisation to shrink the regression coefficients towards zero by penalising the regression model using the L1 norm. Through this process of feature selection, Lasso Regression helps to reduce the model complexity and prevent overfitting. In order to find the best value of the regularisation parameter, lambda, I create a sequence of length 1000 ranging from 0 to 5000. I then build my model using the distance_to_station, water_company, property_type, latitude, longitude, postcode_short, and total_floor_area*(as.factor(london_zone)) variables.

set.seed(69)
# Candidate penalties: 1000 evenly spaced lambda values between 0 and 5000
lambda_seq <- seq(from = 0, to = 5000, length.out = 1000)
# LASSO regression; the resampling defined in `control` selects the best
# lambda among those listed in lambda_seq
lasso <- train(
  price ~ distance_to_station + water_company + property_type + latitude +
    longitude + postcode_short + total_floor_area * (as.factor(london_zone)),
  data = train_data,
  method = "glmnet",
  # centre and scale the predictors before the penalised fit
  preProcess = c("center", "scale"),
  trControl = control,
  # alpha = 1 requests the pure L1 (LASSO) penalty
  tuneGrid = expand.grid(alpha = 1, lambda = lambda_seq)
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortEC1Y,
## postcode_shortW1B, postcode_shortWC1A, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortWC1N, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortWC2B, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortW1F,
## postcode_shortWC1A, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1B, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortTN16,
## postcode_shortWC1A, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortEC4V,
## postcode_shortTW15, postcode_shortWC1A, postcode_shortWC1E, postcode_shortWC2N,
## postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortW1J,
## postcode_shortWC1A, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortKT8,
## postcode_shortSW1Y, postcode_shortW1T, postcode_shortWC1A, postcode_shortWC1E,
## postcode_shortWC2H, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortEC4R,
## postcode_shortSW1E, postcode_shortWC1A, postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: postcode_shortWC1A,
## postcode_shortWC1E, postcode_shortCR6
# Plot the tuning results of the LASSO fit
plot(lasso)

# Predict test-set prices and tabulate RMSE and R-squared
predictions_lasso <- predict(lasso, newdata = test_data)
lasso_results <- data.frame(
  RMSE = RMSE(predictions_lasso, test_data[["price"]]),
  Rsquare = R2(predictions_lasso, test_data[["price"]])
)
lasso_results
##       RMSE   Rsquare
## 1 217099.4 0.8264632

I obtain an R-squared of 0.83 on the testing set using the Lasso Regression model.

5.9 Random Forest

I finally use the Random Forest Machine Learning method to create an ensemble model aggregating multiple decision trees.

# List the tuning parameters caret exposes for the "ranger" method
modelLookup("ranger")
##    model     parameter                         label forReg forClass probModel
## 1 ranger          mtry #Randomly Selected Predictors   TRUE     TRUE      TRUE
## 2 ranger     splitrule                Splitting Rule   TRUE     TRUE      TRUE
## 3 ranger min.node.size             Minimal Node Size   TRUE     TRUE      TRUE
# Tuning grid for ranger: mtry from 10 to 15, with the minimal node size and
# the split rule each held at a single value
Gridtune <- data.frame(
  mtry = 10:15,
  min.node.size = 5,
  splitrule = "variance"
)

set.seed(69)
model1_randomforest <- train(
  price ~ poly(total_floor_area, 2) + average_income + longitude + latitude +
    current_energy_rating + num_tube_lines + type_of_closest_station +
    property_type + london_zone + distance_to_station + water_company +
    freehold_or_leasehold,
  data = train_data,
  method = "ranger",
  trControl = control,
  # request permutation-based variable importance
  importance = "permutation",
  tuneGrid = Gridtune,
  num.trees = 200
)


# Show the variable importance of the random forest (permutation importance
# was requested when the model was trained)
varImp(model1_randomforest)
## ranger variable importance
## 
##   only 20 most important variables shown (out of 23)
## 
##                                      Overall
## poly(total_floor_area, 2)1         100.00000
## london_zone                         52.25043
## poly(total_floor_area, 2)2          22.86491
## longitude                           10.81482
## average_income                       7.20484
## latitude                             5.87083
## freehold_or_leaseholdL               3.95744
## num_tube_lines                       3.38312
## property_typeF                       2.66677
## distance_to_station                  1.38261
## property_typeS                       1.13585
## water_companyThames Water            0.99163
## property_typeT                       0.84502
## type_of_closest_stationrail          0.32242
## type_of_closest_stationtube          0.27067
## water_companySES Water               0.15116
## water_companyEssex & Suffolk Water   0.13805
## water_companyLeep Utilities          0.10378
## current_energy_ratingG               0.09879
## current_energy_ratingC               0.09302
# Plot the variable importance scores
plot(varImp(model1_randomforest))

# List the components stored in the fitted model object
summary(model1_randomforest)
##                           Length Class         Mode     
## predictions               10499  -none-        numeric  
## num.trees                     1  -none-        numeric  
## num.independent.variables     1  -none-        numeric  
## mtry                          1  -none-        numeric  
## min.node.size                 1  -none-        numeric  
## variable.importance          23  -none-        numeric  
## prediction.error              1  -none-        numeric  
## forest                        7  ranger.forest list     
## splitrule                     1  -none-        character
## treetype                      1  -none-        character
## r.squared                     1  -none-        numeric  
## call                          9  -none-        call     
## importance.mode               1  -none-        character
## num.samples                   1  -none-        numeric  
## replace                       1  -none-        logical  
## xNames                       23  -none-        character
## problemType                   1  -none-        character
## tuneValue                     3  data.frame    list     
## obsLevels                     1  -none-        logical  
## param                         2  -none-        list
# Print the resampling results for each mtry value and the selected tuning values
print(model1_randomforest)
## Random Forest 
## 
## 10499 samples
##    12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (15 fold) 
## Summary of sample sizes: 9798, 9798, 9799, 9799, 9799, 9800, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   10    195304.1  0.8560306  96268.90
##   11    194447.3  0.8566175  96232.25
##   12    194363.4  0.8572595  96159.97
##   13    194057.6  0.8577990  96284.23
##   14    193346.3  0.8582430  96278.45
##   15    192981.6  0.8583826  96284.55
## 
## Tuning parameter 'splitrule' was held constant at a value of variance
## 
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 15, splitrule = variance
##  and min.node.size = 5.
# Predict test_data prices with the tuned random forest
predictions_rf <- predict(model1_randomforest, newdata = test_data)

# RMSE and R-squared of those predictions against the actual prices
rf_results <- data.frame(
  RMSE = RMSE(predictions_rf, test_data$price),
  Rsquare = R2(predictions_rf, test_data$price)
)
rf_results
##       RMSE   Rsquare
## 1 222561.4 0.8166155

5.10 Comparing performance of all the created models

Below I compare the test-set performance of all the models:

# Print the RMSE / R-squared table of each model in turn
# (linear regression, tree, lasso, random forest, GBM)
lr_results 
##       RMSE   Rsquare
## 1 217841.1 0.8258659
tree_results 
##       RMSE   Rsquare
## 1 273477.4 0.7491902
lasso_results 
##       RMSE   Rsquare
## 1 217099.4 0.8264632
rf_results 
##       RMSE   Rsquare
## 1 222561.4 0.8166155
gbm_results 
##       RMSE   Rsquare
## 1 208298.9 0.8368225

The best model according to our analysis up to this point is the GBM, which has the lowest RMSE and the highest R-squared (83.7%) on the test data.

5.11 Stacking

I now combine the best models I created into a single stacked model in order to obtain more precise predictions; stacking is an ensemble method that combines heterogeneous weak learners into a more powerful meta-model. Since the Lasso Regression and Linear Regression are highly correlated, I build my stacked learner from the Decision Tree, the Lasso Regression, the Random Forest, and the GBM.

# Bundle the four fitted models into a named list and tag it with the
# "caretList" class
# NOTE(review): the class is set by hand rather than through a caretEnsemble
# constructor -- confirm all four models share the same resampling setup
multimodel <- structure(
  list(
    tree = model4_tree,
    lasso = lasso,
    ranger = model1_randomforest,
    gbm = model1_gbm
  ),
  class = "caretList"
)

6 Visualising results

# Pairwise correlations between the four models, computed from their resamples
modelCor(resamples(multimodel))
##             tree     lasso    ranger       gbm
## tree   1.0000000 0.7334402 0.9074172 0.8663864
## lasso  0.7334402 1.0000000 0.8386928 0.8086503
## ranger 0.9074172 0.8386928 1.0000000 0.8934407
## gbm    0.8663864 0.8086503 0.8934407 1.0000000
# Compare the models' resampled R-squared using three different plot types
dotplot(resamples(multimodel), metric="Rsquared")

xyplot(resamples(multimodel), metric="Rsquared")

splom(resamples(multimodel), metric="Rsquared")

library(caret)
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
# Stack the models in `multimodel` using a linear model ("lm") as the
# meta-learner, with RMSE as the metric and `control` for resampling
model_list <- caretStack(
  multimodel,
  method = "lm",
  metric = "RMSE",
  trControl = control
)

# Summarise the fitted stack
summary(model_list)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3043874   -51578     2114    48822  4055677 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.232e+04  2.942e+03  -7.584 3.64e-14 ***
## tree         9.013e-02  1.337e-02   6.741 1.66e-11 ***
## lasso        1.688e-01  1.446e-02  11.672  < 2e-16 ***
## ranger       4.616e-01  1.978e-02  23.336  < 2e-16 ***
## gbm          3.144e-01  1.491e-02  21.088  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 186100 on 10494 degrees of freedom
## Multiple R-squared:  0.8721, Adjusted R-squared:  0.872 
## F-statistic: 1.789e+04 on 4 and 10494 DF,  p-value: < 2.2e-16
# Predict test_data prices with the stacked model
predictions_stacked <- predict(model_list, newdata = test_data)

# RMSE and R-squared of the stacked predictions against the actual prices
stacked_results <- data.frame(
  RMSE = RMSE(predictions_stacked, test_data$price),
  Rsquare = R2(predictions_stacked, test_data$price)
)
stacked_results
##       RMSE   Rsquare
## 1 207708.1 0.8411913

I perform 15-fold cross-validation to confirm the performance of the stacked model and find an adjusted R-squared of 87.2% on the training set and an R-squared of 84.1% on the testing set.

As we can see, the stacked model achieves the best R-squared of all our models. Therefore, we will use this model as an estimation engine to guide investment in the London housing market.

6.0.1 Performance on test data

First I will check the performance of our model on the test dataset.

# Back-test the investment rule on test_data, where the true prices are known.

numchoose <- 200 # number of investments to choose
set.seed(1) # fix the RNG seed for the simulated asking prices

# Simulate asking prices: scale each true price by 1 / (1 - u),
# with u drawn uniformly from [-0.2, 0.2]
random_mult <- 1 / (1 - runif(nrow(test_data), min = -0.2, max = 0.2))
test_data$asking_price <- test_data$price * random_mult

# Predict the value of each house with the stacked model
test_data$predict <- predict(model_list, test_data)

# Predicted profit margin relative to the asking price, largest margin first
test_data <- test_data %>%
  mutate(profitMargin = (predict - asking_price) / asking_price) %>%
  arrange(desc(profitMargin))

# Flag the top `numchoose` rows as investments. The count is capped at
# nrow(test_data) so the flag never indexes past the end of the data, which
# `test_data[1:numchoose, ]` would do when fewer rows are available.
n_invest <- min(numchoose, nrow(test_data))
test_data$invest <- as.numeric(seq_len(nrow(test_data)) <= n_invest)

# Realised margin per house (true price vs asking price); actualProfit keeps
# that margin for chosen houses and is zero for the rest
test_data <- test_data %>%
  mutate(
    profit = (price - asking_price) / asking_price,
    actualProfit = invest * profit
  )

# Average realised margin over every house in test_data
mean(test_data$profit)
## [1] 0.00189603
# Average realised margin over the numchoose selected houses
# (actualProfit is zero for houses that were not selected)
sum(test_data$actualProfit)/numchoose
## [1] 0.0552229

7 Pick investments

In this section I will use the best algorithm I identified to choose 200 properties from the out of sample data.

numchoose <- 200 # number of properties to buy

oos <- london_house_prices_2019_out_of_sample

# Replace missing population values with the mean of the observed ones
# NOTE(review): presumably needed so predict() gets complete rows -- confirm
missing_population <- is.na(oos[["population"]])
oos[missing_population, "population"] <- mean(oos[["population"]], na.rm = TRUE)

# Predict the value of each house with the stacked model
oos$predict <- predict(model_list, oos)

# Predicted profit margin relative to the asking price, largest margin first
oos <- oos %>%
  mutate(profitMargin = (predict - asking_price) / asking_price) %>%
  arrange(desc(profitMargin))

# Flag the top `numchoose` rows to buy. The count is capped at nrow(oos) so
# the flag never indexes past the end of the data, which `oos[1:numchoose, ]`
# would do when fewer rows are available.
n_buy <- min(numchoose, nrow(oos))
oos$buy <- as.numeric(seq_len(nrow(oos)) <= n_buy)

# Total predicted profit (predicted value minus asking price) over the
# selected properties
oos %>%
  filter(buy == 1) %>%
  summarise(profit = sum(predict - asking_price))
##     profit
## 1 61307189
# Total asking price across the selected properties
summarise(filter(oos, buy == 1), investment = sum(asking_price))
##   investment
## 1  102673000
# Drop the helper profitMargin column before exporting
keep_cols <- !(names(oos) %in% "profitMargin")
oos <- oos[, keep_cols]

# Write the selection to CSV
# NOTE(review): write.csv() also writes row names by default -- confirm the
# extra leading column is wanted in the deliverable
write.csv(x = oos, file = "Jayant_Advait.csv")

The predicted profit on an investment of about 102.7 million pounds is about 61.3 million pounds!